import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression, Lasso, LassoCV, LogisticRegression, Ridge, RidgeCV
from sklearn.tree import DecisionTreeRegressor, export_graphviz
from sklearn.model_selection import KFold, cross_val_score,train_test_split, GridSearchCV, ParameterGrid, StratifiedKFold, RandomizedSearchCV
from sklearn.ensemble import VotingRegressor, GradientBoostingRegressor, BaggingRegressor, RandomForestRegressor, AdaBoostRegressor
from bayes_opt import BayesianOptimization
import itertools as it
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from pyearth import Earth
import warnings
from six import StringIO
from IPython.display import Image
import pydotplus
import time as time
import math
import warnings
warnings.filterwarnings("ignore")Project Code
Saturn
1 Data quality check / cleaning / preparation
Put code with comments. The comments should explain the code such that it can be easily understood. You may put text (in a markdown cell) before a large chunk of code to explain the overall purpose of the code, if it is not intuitive. Put the name of the person / persons who contributed to each code chunk / set of code chunks. An example is given below.
1.1 Data cleaning
By Anastasia Wei
# read in the red and white wine files (semicolon-delimited UCI format)
red = pd.read_csv('Data/winequality-red.csv', delimiter=";")
white = pd.read_csv('Data/winequality-white.csv', delimiter=";")
# tag each frame with its wine type before combining
for frame, wine_type in ((red, 'red'), (white, 'white')):
    frame['type'] = wine_type
# combine the red and white wine data into a single frame with a fresh index
all_df = pd.concat([red, white], ignore_index=True)
all_df.head()| fixed acidity | volatile acidity | citric acid | residual sugar | chlorides | free sulfur dioxide | total sulfur dioxide | density | pH | sulphates | alcohol | quality | type | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 7.4 | 0.70 | 0.00 | 1.9 | 0.076 | 11.0 | 34.0 | 0.9978 | 3.51 | 0.56 | 9.4 | 5 | red |
| 1 | 7.8 | 0.88 | 0.00 | 2.6 | 0.098 | 25.0 | 67.0 | 0.9968 | 3.20 | 0.68 | 9.8 | 5 | red |
| 2 | 7.8 | 0.76 | 0.04 | 2.3 | 0.092 | 15.0 | 54.0 | 0.9970 | 3.26 | 0.65 | 9.8 | 5 | red |
| 3 | 11.2 | 0.28 | 0.56 | 1.9 | 0.075 | 17.0 | 60.0 | 0.9980 | 3.16 | 0.58 | 9.8 | 6 | red |
| 4 | 7.4 | 0.70 | 0.00 | 1.9 | 0.076 | 11.0 | 34.0 | 0.9978 | 3.51 | 0.56 | 9.4 | 5 | red |
1.2 Distribution of response
By Anastasia Wei
# Mean and standard deviation of response
print('Response mean =', np.average(all_df.quality))
print('Response standard deviation =', np.std(all_df.quality))Response mean = 5.818377712790519
Response standard deviation = 0.8731880644450432
# response distribution
# red histogram drawn with zorder=5 so it sits above the white one
plt.hist(red.quality.values, bins = np.arange(1,10), color = 'r', alpha = 0.8, zorder = 5, label = 'red')
plt.title('Wine Quality Distribution', fontsize=14)
# overlay the white-wine counts with transparency so both remain visible
plt.hist(white.quality.values, bins = np.arange(1,10), color = 'grey', alpha = 0.5, label = 'white')
plt.legend()
plt.xlabel('Wine Quality', fontsize=14)
plt.ylabel('Count', fontsize=14);1.3 Data preparation
By Amy Wang
The following data preparation steps helped us to prepare our data for implementing various modeling / validation techniques:
- Turn
typeinto dummy variables to make all our variables numerical. - Scale the predictors so that regression models treat each predictor with equal weight.
- Split the data into test (20%) and train (80%) datasets
######---------------Turn the type predictor into dummy variables----------------#########
# Separate the response from the predictors, then one-hot encode the
# categorical `type` column so every predictor is numeric
y = all_df.quality
X = all_df.drop("quality", axis=1)
X_dummy = pd.get_dummies(X)
X_dummy.head()| fixed acidity | volatile acidity | citric acid | residual sugar | chlorides | free sulfur dioxide | total sulfur dioxide | density | pH | sulphates | alcohol | type_red | type_white | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 7.4 | 0.70 | 0.00 | 1.9 | 0.076 | 11.0 | 34.0 | 0.9978 | 3.51 | 0.56 | 9.4 | 1 | 0 |
| 1 | 7.8 | 0.88 | 0.00 | 2.6 | 0.098 | 25.0 | 67.0 | 0.9968 | 3.20 | 0.68 | 9.8 | 1 | 0 |
| 2 | 7.8 | 0.76 | 0.04 | 2.3 | 0.092 | 15.0 | 54.0 | 0.9970 | 3.26 | 0.65 | 9.8 | 1 | 0 |
| 3 | 11.2 | 0.28 | 0.56 | 1.9 | 0.075 | 17.0 | 60.0 | 0.9980 | 3.16 | 0.58 | 9.8 | 1 | 0 |
| 4 | 7.4 | 0.70 | 0.00 | 1.9 | 0.076 | 11.0 | 34.0 | 0.9978 | 3.51 | 0.56 | 9.4 | 1 | 0 |
######---------------Scaling the predictors----------------#########
# Min-max scale every predictor into [0, 1] so regression models
# treat each predictor with equal weight
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(X_dummy)
X_dummy_sca = pd.DataFrame(scaled_values, columns=X_dummy.columns)
X_dummy_sca.head()| fixed acidity | volatile acidity | citric acid | residual sugar | chlorides | free sulfur dioxide | total sulfur dioxide | density | pH | sulphates | alcohol | type_red | type_white | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.297521 | 0.413333 | 0.000000 | 0.019939 | 0.111296 | 0.034722 | 0.064516 | 0.206092 | 0.612403 | 0.191011 | 0.202899 | 1.0 | 0.0 |
| 1 | 0.330579 | 0.533333 | 0.000000 | 0.030675 | 0.147841 | 0.083333 | 0.140553 | 0.186813 | 0.372093 | 0.258427 | 0.260870 | 1.0 | 0.0 |
| 2 | 0.330579 | 0.453333 | 0.024096 | 0.026074 | 0.137874 | 0.048611 | 0.110599 | 0.190669 | 0.418605 | 0.241573 | 0.260870 | 1.0 | 0.0 |
| 3 | 0.611570 | 0.133333 | 0.337349 | 0.019939 | 0.109635 | 0.055556 | 0.124424 | 0.209948 | 0.341085 | 0.202247 | 0.260870 | 1.0 | 0.0 |
| 4 | 0.297521 | 0.413333 | 0.000000 | 0.019939 | 0.111296 | 0.034722 | 0.064516 | 0.206092 | 0.612403 | 0.191011 | 0.202899 | 1.0 | 0.0 |
## Quantile-Quantile (Q-Q) plots for all predictors
np.random.seed(1)
# One Q-Q plot per predictor column; a loop replaces 13 copy-pasted qqplot
# calls (the individual fig1..fig13 variables were never used again).
for col in X_dummy.columns:
    sm.qqplot(X_dummy[col], line='45')
plt.show()######-----Splitting the data into test and train datasets-------#########
X_train, X_test, y_train, y_test = train_test_split(X_dummy_sca, y, test_size = 0.2, random_state = 45)2 Exploratory data analysis
By Kaitlyn Hung
# Correlations of predictors with response variable
all_df.corrwith(all_df.quality).sort_values()density -0.305858
volatile acidity -0.265699
chlorides -0.200666
fixed acidity -0.076743
total sulfur dioxide -0.041385
residual sugar -0.036980
pH 0.019506
sulphates 0.038485
free sulfur dioxide 0.055463
citric acid 0.085532
alcohol 0.444319
quality 1.000000
dtype: float64
# Pairplot of predictors and response to search for interesting trends and variable relationships
sns.pairplot(data = all_df)<seaborn.axisgrid.PairGrid at 0x7ffbe100ddf0>
# Defining function to make exploratory plots
def plot_dist(var):
    """Plot the distribution of `var` and its relationship with wine quality."""
    # side-by-side panels: distribution left, relation to response right
    fig, axes = plt.subplots(1, 2, figsize=(10, 4))
    plt.subplots_adjust(wspace=0.2)
    # histogram to visualize distribution and barplot to visualize relationship to response
    sns.histplot(x=var, data=all_df, ax=axes[0])
    sns.barplot(x="quality", y=var, data=all_df, ax=axes[1])
# Clear trend that lower volatile acidity values have better quality
plot_dist("volatile acidity")# Clear relationship with response: higher alcohol content is related to improved quality
plot_dist("alcohol")# higher citric acid content appears to relate to higher quality
plot_dist("citric acid")# lower chloride content appears to relate to improved quality
plot_dist("chlorides")# No clear relationship between density and quality
plot_dist("density")# No clear relationship between pH and quality
plot_dist("pH")# No clear relationship between fixed acidity and quality
plot_dist("fixed acidity")# No clear relationship between total SO2 and quality.
#The distribution of total SO2 is bimodal with peaks around 25 and 125 units.
plot_dist("total sulfur dioxide")# No clear relationship between sulphates and quality
plot_dist("sulphates")# No clear relationship between residual sugar and quality
# The distribution of residual sugar appears to be skewed
plot_dist("residual sugar")# No clear relationship between free SO2 and quality
plot_dist("free sulfur dioxide")3 Developing the model: Hyperparameter tuning
Put code with comments. The comments should explain the code such that it can be easily understood. You may put text (in a markdown cell) before a large chunk of code to explain the overall purpose of the code, if it is not intuitive. Put the name of the person / persons who contributed to each code chunk / set of code chunks.
Put each model in a section of its name and mention the name of the team-member tuning the model. Below is an example:
3.1 Intercept Base Model
By Anastasia Wei
# predicting the response as the mean of the train response
pred = [np.around(np.mean(y_train))]*len(y_test)# test RMSE
np.sqrt(mean_squared_error(y_test, pred))0.9327379053088815
3.2 Ridge and Lasso Regression
By Anastasia Wei
### Ridge Regression
# defining the regularization parameter space: a log-spaced grid of alphas
alphas = 0.5 * 10 ** np.linspace(1.5, -3, 200)
# cross-validated search for the best alpha over that grid
ridgecv = RidgeCV(alphas = alphas, scoring = 'neg_mean_squared_error', store_cv_values = True)
ridgecv.fit(X_train, y_train)
print("Optimal alpha = ", ridgecv.alpha_)
# refit a ridge model at the selected alpha and report train RMSE
# (predictions are rounded to the nearest integer before scoring)
ridge = Ridge(alpha = ridgecv.alpha_).fit(X_train, y_train)
pred = ridge.predict(X_test)
print('Train RMSE = ', np.sqrt(mean_squared_error(y_train, np.around(ridge.predict(X_train)))))
print('Test RMSE = ', np.sqrt(mean_squared_error(y_test, np.around(pred))))Optimal alpha = 0.01636987568069109
Train RMSE = 0.7892143827427598
Test RMSE = 0.8166535844059444
#### Lasso Regression
# Searching through the regularization parameter space
# NOTE(review): alpha is chosen by TRAIN RMSE here, which systematically
# favors weaker regularization — cross-validation would be a sounder criterion.
alphas = 0.5 * 10 ** np.linspace(2, -5, 200)
rmses = []
for a in alphas:
    lasso = Lasso(alpha=a)
    train_pred = lasso.fit(X_train, y_train).predict(X_train)
    rmses.append(np.sqrt(mean_squared_error(y_train, np.around(train_pred))))
# Visualizing rmse vs alpha
plt.scatter(alphas, rmses, s=3)
plt.ylabel('RMSE')
plt.xlabel('alpha')
plt.xscale('log')print('Optimal alpha =', alphas[np.array(rmses).argmin()])
print('Optimal train RMSE = ', rmses[np.array(rmses).argmin()])Optimal alpha = 0.0004301732208342255
Optimal train RMSE = 0.7806343783815696
# Refit the lasso at the alpha chosen above; reuse the computed optimum
# instead of a hard-coded copy of the constant (was 0.0004301732208342255,
# which silently drifts if the search grid above ever changes).
lasso = Lasso(alpha = alphas[np.array(rmses).argmin()])
pred = lasso.fit(X_train, y_train).predict(X_test)
print('Test RMSE = ', np.sqrt(mean_squared_error(y_test, np.around(pred))))Test RMSE = 0.8138228875545909
3.3 MARS
By Lila Wells
Beginning with the original coarse grid search (optimizing max_depth and max_degree simultaneously in a nested for loop)
# Coarse grid search for hyperparameter optimization
# Note: I fit 25 models here and it took approximately an hour and a half (90 mins)
warnings.filterwarnings("ignore")
warnings.simplefilter(action='ignore', category = FutureWarning)
# (removed `init_search_df` — it was created but never written to or read)
# Lists to store the values for plotting
degrees = []
max_terms_values = []
mean_scores = []
# Initializing optimal parameters & best score variables
opt_degree = 1
opt_max_terms = 500
best_score = -float('inf')
# Outer loop for degree
for degree in range(1, 11, 2):
    # Creating a MARS model with the current degree; max_terms is overwritten
    # each inner iteration (cross_val_score clones the estimator, so mutating
    # the attribute between fits is safe)
    model = Earth(max_terms = 500, max_degree = degree)
    # Inner loop for max_terms
    for terms in range(400, 1201, 200):
        model.max_terms = terms
        # 5-fold cross validation on negated RMSE (higher score is better)
        scores = cross_val_score(model,
                                 X_train,
                                 y_train,
                                 cv = 5,
                                 scoring = 'neg_root_mean_squared_error')
        # Computing mean score
        mean_score = scores.mean()
        # Saving the values for plotting
        degrees.append(degree)
        max_terms_values.append(terms)
        mean_scores.append(mean_score)
        # Keeping the best (degree, max_terms) pair seen so far
        if mean_score > best_score:
            best_score = mean_score
            opt_degree = degree
            opt_max_terms = terms
# Printing the optimal hyperparameters
print('Optimal max_terms in coarse grid search = ', opt_max_terms)
print('Optimal max_degree in coarse grid search = ', opt_degree) Optimal max_terms in coarse grid search = 400
Optimal max_degree in coarse grid search = 5
### Defining the initial MARS model and making predictions on test data to compute RMSE
# Defining the MARS model
mars_init = Earth(max_terms = 400, max_degree = 5).fit(X_train, y_train)
# Making predictions & rounding them to the nearest integer
pred_mars_init = np.around(mars_init.predict(X_test))
# Printing the coarse grid RMSE
print("Coarse Grid Tuning MARS Model Test RMSE:", np.sqrt(mean_squared_error(y_test, pred_mars_init)))Coarse Grid Tuning MARS Model Test RMSE: 1.5657389506359167
Attempting to further optimize the MARS model with a finer grid search over a narrower range of hyperparameters (4-7 for max_degree and 300-800 for max_terms).
# Finer grid search
warnings.filterwarnings("ignore")
warnings.simplefilter(action='ignore', category = FutureWarning) # Got 7 and 800
# Initializing optimal parameters & best score variables
opt_degree = 1
opt_max_terms = 500
best_score = -float('inf')
# Outer loop for degree (narrowed around the coarse-search optimum)
for degree in [4, 5, 6, 7]:
    # Creating a MARS model with the current degree; max_terms is set per inner iteration
    model = Earth(max_terms = 500, max_degree = degree)
    # Inner loop for max_terms
    for terms in range(300, 801, 100):
        model.max_terms = terms
        # 5-fold cross validation
        scores = cross_val_score(model,
                                 X_train,
                                 y_train,
                                 cv = 5,
                                 scoring = 'neg_root_mean_squared_error')
        # Computing mean score
        mean_score = scores.mean()
        # Checking if mean score is better than the current best score
        if mean_score > best_score:
            best_score = mean_score
            opt_degree = degree
            opt_max_terms = terms
# BUG FIX: report the tracked optima (`opt_degree`, `opt_max_terms`), not the
# loop variables `degree`/`terms`, which always hold the LAST grid values (7, 800)
# regardless of which combination actually scored best.
print("Optimal degree in fine grid search = ", opt_degree)
print("Optimal max_terms in fine grid search = ", opt_max_terms)
# Notebook output (from the buggy prints): degree 7, max_terms 800
Optimal max_terms in fine grid search = 800
### Defining the final MARS model and making predictions on test data to compute RMSE
# Defining the MARS model
mars_final = Earth(max_terms = 800, max_degree = 7).fit(X_train, y_train)
# Making predictions & rounding them to the nearest integer
pred_mars_final = np.around(mars_final.predict(X_test))
# Printing the coarse grid RMSE
print("Fine Grid Tuning MARS Model Test RMSE:", np.sqrt(mean_squared_error(y_test, pred_mars_final)))Fine Grid Tuning MARS Model Test RMSE: 0.790326125479466
The next step here was to predict the model’s train data residuals, and build a model to predict those residuals (in an effort to improve the MARS model).
# Making predictions with the optimized MARS model
prediction_mars = np.around(mars_final.predict(X_train)) # Rounding them to the nearest integer
# Converting the predictions to a dataframe
prediction_mars = pd.Series(prediction_mars, name='quality')
prediction_mars = prediction_mars.to_frame().rename(columns={'quality': 'quality'})
# Calculate residuals
residuals = y_train - prediction_mars
# Plotting residuals
df = pd.concat([prediction_mars, residuals], axis=1)
df.columns = ['quality', 'residuals']
# Jittering the points
jitter_amount = 0.3
df['jittered_residuals'] = df['residuals'] + np.random.uniform(low=-jitter_amount, high=jitter_amount, size=len(df))
# Plotting the points
residual_mars_plt = df.plot(x='quality', y='jittered_residuals', kind='scatter')
residual_mars_plt.axhline(y=0, color='red', linestyle='--') # Add line y = 0
residual_mars_plt.set_xlabel('Predicted Quality Values')
residual_mars_plt.set_ylabel('Residuals')
residual_mars_plt.set_title('Residual Plot')
residual_mars_plt.set_xlim([0, 10])
residual_mars_plt.set_ylim([-5, 5])
residual_mars_plt.set_yticks([-5, 0, 5])Now creating a model to predict residuals of the MARS model.
# Fitting a MARS model to predict residuals
residual_model = Earth()
residual_model.fit(prediction_mars, residuals)
# Make predictions on test set
residuals_pred = residual_model.predict(pred_mars_final) # These predictions are the MARS test predictionsAdding the residual predictions to the test
# Making predictions with the residuals model
# Rounding predictions to the nearest integer
residuals_pred = np.around(residual_model.predict(pred_mars_final))
residuals_pred_series = pd.Series(residuals_pred, name='quality') # Making residuals into a series
pred_mars_final = pd.Series(pred_mars_final, name='quality') # Turning the test predictions into a Series
# Add predicted residuals to y_pred to get final predictions
y_pred_final = pd.concat([pred_mars_final, residuals_pred_series], axis=1).sum(axis=1)
# Calculating the final MARS model RMSE
print("Test RMSE of Fine Grid MARS Model & Residuals Model:", np.sqrt(mean_squared_error(y_test, y_pred_final)))Test RMSE of Fine Grid MARS Model & Residuals Model: 0.7775701798650618
3.4 Decision Tree
By Kaitlyn Hung
Getting a sense of the base RMSE for a tree model and deciding hyperparameter value ranges to consider for tuning.
# Developing model: an untuned baseline tree to gauge RMSE before tuning
tree_model = DecisionTreeRegressor(random_state=45).fit(X_train, y_train)
y_pred = tree_model.predict(X_test)
print("RMSE: ", np.sqrt(mean_squared_error(y_test, y_pred)))RMSE: 0.8507914867029135
tree_model.get_depth()27
tree_model.get_n_leaves()1469
Used the depth and number of leaves as the max hyperparameter values for a coarse grid search
# Coarse hyperparameter grid bounded by the untuned tree's depth (27) and leaf count (1469)
coarse_grid = {
    'max_depth': np.arange(2, 27, 5),
    'max_leaf_nodes': np.arange(100, 1500, 250),
    'min_samples_leaf': np.arange(1, 10, 2),
}
grid_search = GridSearchCV(
    DecisionTreeRegressor(random_state=45),
    coarse_grid,
    scoring="neg_mean_squared_error",
    n_jobs=-1,
    verbose=True,
)
grid_search.fit(X_train, y_train)
print('Best MSE Through Grid Search : %.3f'%grid_search.best_score_)
print(grid_search.best_params_)Fitting 5 folds for each of 150 candidates, totalling 750 fits
Best MSE Through Grid Search : -0.531
{'max_depth': 7, 'max_leaf_nodes': 100, 'min_samples_leaf': 9}
# Plotting results
cv_results = pd.DataFrame(grid_search.cv_results_)
fig, axes = plt.subplots(1,3,figsize=(14,5))
plt.subplots_adjust(wspace=0.2)
axes[0].plot(cv_results.param_max_depth, -cv_results.mean_test_score, 'o')
axes[0].set_xlabel('Depth')
axes[0].set_ylabel('K-fold MSE')
axes[1].plot(cv_results.param_max_leaf_nodes, -cv_results.mean_test_score, 'o')
axes[1].set_xlabel('Leaves')
axes[1].set_ylabel('K-fold MSE');
axes[2].plot(cv_results.param_min_samples_leaf, -cv_results.mean_test_score, 'o')
axes[2].set_xlabel('min samples')
axes[2].set_ylabel('K-fold MSE');Using the optimal values and visualizations to adjust the ranges of values considered for a fine grid search.
# Finer grid centered on the coarse-search optimum (depth 7, 100 leaves, 9 min samples)
fine_grid = {
    'max_depth': range(3, 12),
    'max_leaf_nodes': np.arange(2, 127, 25),
    'min_samples_leaf': [8, 9, 10],
}
grid_search = GridSearchCV(
    DecisionTreeRegressor(random_state=45),
    fine_grid,
    scoring="neg_mean_squared_error",
    n_jobs=-1,
    verbose=True,
)
grid_search.fit(X_train, y_train)
print('Best MSE Through Grid Search : %.3f'%grid_search.best_score_)
print(grid_search.best_params_)Fitting 5 folds for each of 135 candidates, totalling 675 fits
Best MSE Through Grid Search : -0.522
{'max_depth': 6, 'max_leaf_nodes': 52, 'min_samples_leaf': 8}
# Plotting results
cv_results = pd.DataFrame(grid_search.cv_results_)
fig, axes = plt.subplots(1,3,figsize=(14,5))
plt.subplots_adjust(wspace=0.2)
axes[0].plot(cv_results.param_max_depth, -cv_results.mean_test_score, 'o')
axes[0].set_xlabel('Depth')
axes[0].set_ylabel('K-fold MSE')
axes[1].plot(cv_results.param_max_leaf_nodes, -cv_results.mean_test_score, 'o')
axes[1].set_xlabel('Leaves')
axes[1].set_ylabel('K-fold MSE');
axes[2].plot(cv_results.param_min_samples_leaf, -cv_results.mean_test_score, 'o')
axes[2].set_xlabel('min samples')
axes[2].set_ylabel('K-fold MSE');Tree model with optimal hyperparameter values:
tree_model = DecisionTreeRegressor(random_state=45, max_leaf_nodes = 52, max_depth = 6, min_samples_leaf = 8) .fit(X_train, y_train)# function to round predictions to nearest integer
def round_half_up(n, decimals=0):
    """Round `n` to `decimals` places, with exact halves rounded upward."""
    factor = 10 ** decimals
    return math.floor(n * factor + 0.5) / factor
# Making predictions then rounding to nearest integer
raw_test_pred = tree_model.predict(X_test)
test_pred = np.array([round_half_up(xi) for xi in raw_test_pred])# Test RMSE for decision tree model
print("Decision Tree Test RMSE:", np.sqrt(mean_squared_error(y_test, test_pred)))Decision Tree Test RMSE: 0.8507914867029135
3.5 Bagging Decision Tree
By Lila Wells
First, identifying the optimal number of trees to use by visualizing the number of trees vs. R-squared
# Finding model accuracy vs number of trees
oob_rsquared = {}; test_rsquared = {}; oob_rmse = {}; test_rmse = {}
# Iterating over tree values
for i in range(100, 1201, 50):
    # Use `estimator` (not the deprecated `base_estimator`) for consistency
    # with the grid searches below, which already use `estimator`.
    model = BaggingRegressor(estimator = DecisionTreeRegressor(random_state = 1),
                             n_estimators = i,
                             random_state = 1,
                             n_jobs=-1,
                             oob_score = True).fit(X_train, y_train)
    oob_rsquared[i] = model.oob_score_ # Returns the out-of_bag R-squared of the model
    test_rsquared[i] = model.score(X_test, y_test) # Returns the test R-squared of the model
    oob_rmse[i] = np.sqrt(mean_squared_error(model.oob_prediction_, y_train))
    test_rmse[i] = np.sqrt(mean_squared_error(model.predict(X_test), y_test))
# Plotting results
fig, axes = plt.subplots(1,2,figsize=(14,5))
plt.subplots_adjust(wspace= 0.3)
axes[0].plot(oob_rsquared.keys(),oob_rsquared.values(),label = 'Out of bag R-squared')
axes[0].plot(oob_rsquared.keys(),oob_rsquared.values(),'o',color = 'blue')
axes[0].plot(test_rsquared.keys(),test_rsquared.values(), label = 'Test data R-squared')
axes[0].set_title('Number of Trees vs. R-Squared')
axes[0].set_xlabel('Number of trees')
axes[0].set_ylabel('Rsquared')
axes[0].legend()
axes[1].plot(oob_rmse.keys(),oob_rmse.values(),label = 'Out of bag RMSE')
axes[1].plot(oob_rmse.keys(),oob_rmse.values(),'o',color = 'blue')
axes[1].plot(test_rmse.keys(),test_rmse.values(), label = 'Test data RMSE')
axes[1].set_title('Number of Trees vs. RMSE')
axes[1].set_xlabel('Number of trees')
axes[1].set_ylabel('RMSE')
axes[1].legend()<matplotlib.legend.Legend at 0x12e8658e0>
From this, it looks like any number of trees past 500 seems to stabilize the test RMSE.
import warnings
warnings.filterwarnings("ignore")
warnings.simplefilter(action='ignore', category = FutureWarning)
# Optimizing hyperparameters
n_samples = X_train.shape[0]
n_features = X_train.shape[1]
params = {'estimator': [DecisionTreeRegressor(random_state = 1),
DecisionTreeRegressor(random_state = 1, max_depth = 6),
DecisionTreeRegressor(random_state = 1, max_depth = 10)],
'n_estimators': range(500, 1001, 100),
'max_samples': [0.5, 0.75, 1.0],
'max_features': [0.5, 1.0],
'bootstrap': [True, False],
'bootstrap_features': [True, False]}
# 2-fold cross validation to minimize runtime
cv = KFold(n_splits = 2, shuffle = True, random_state = 1)
bagging_grid = GridSearchCV(BaggingRegressor(random_state = 1, n_jobs = -1),
param_grid = params,
cv = cv,
scoring = "neg_root_mean_squared_error",
n_jobs = -1,
verbose = 1)
bagging_grid.fit(X_train, y_train.values.ravel())
print('Best Score Through Grid Search : %.3f'%bagging_grid.best_score_)
print('Best Parameters : ', bagging_grid.best_params_)Fitting 2 folds for each of 144 candidates, totalling 288 fits
Best Score Through Grid Search : -0.639
Best Parameters : {'bootstrap': False, 'bootstrap_features': True, 'estimator': DecisionTreeRegressor(random_state=1), 'max_features': 1.0, 'max_samples': 0.75, 'n_estimators': 1000}
Visualizing the results of the hyperparameter search
# Plotting results of the bagging hyperparameter search
cv_results = pd.DataFrame(bagging_grid.cv_results_)
fig, axes = plt.subplots(1, 3, figsize=(14, 5))
plt.subplots_adjust(wspace=0.3)
axes[0].plot(cv_results.param_n_estimators, -cv_results.mean_test_score, 'o')
axes[0].set_xlabel('N_Estimators')
axes[0].set_ylabel('K-fold RMSE')
# BUG FIX: a missing comma made this `plot(param_max_samples - mean_test_score)`,
# plotting a meaningless difference against the row index; pass x and y separately.
axes[1].plot(cv_results.param_max_samples, -cv_results.mean_test_score, 'o')
axes[1].set_xlabel('Max_Samples')
axes[1].set_ylabel('K-fold RMSE');
axes[2].plot(cv_results.param_max_features, -cv_results.mean_test_score, 'o')
axes[2].set_xlabel('Max_Features')
axes[2].set_ylabel('K-fold RMSE');Defining the initial base model.
# Defining the initial base model
bagging_init = BaggingRegressor(random_state = 1,
n_jobs = -1,
estimator = DecisionTreeRegressor(random_state = 1),
n_estimators = 900,
max_samples = 0.75,
max_features = 1.0,
bootstrap_features = True,
bootstrap = False)
bagging_init.fit(X_train, y_train)BaggingRegressor(bootstrap=False, bootstrap_features=True,
estimator=DecisionTreeRegressor(random_state=1),
max_samples=0.75, n_estimators=900, n_jobs=-1, random_state=1)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
BaggingRegressor(bootstrap=False, bootstrap_features=True,
estimator=DecisionTreeRegressor(random_state=1),
max_samples=0.75, n_estimators=900, n_jobs=-1, random_state=1)DecisionTreeRegressor(random_state=1)
DecisionTreeRegressor(random_state=1)
Then, making predictions on test data to identify the test RMSE of this bagging model (optimized via a coarse grid search of hyperparameters).
# Making predictions
pred_init = np.around(bagging_init.predict(X_test)) # Rounding predictions to the nearest integer
# Printing the test RMSE of the coarse grid tuning model
print("Coarse Grid Tuning Bagging Model Test RMSE:", np.sqrt(mean_squared_error(y_test, pred_init)))Coarse Grid Tuning Bagging Model Test RMSE: 0.6673714223613529
Now, implementing a finer grid search of hyperparameters in an attempt to lower the bagging model’s RMSE.
# Finer grid search
import warnings
warnings.filterwarnings("ignore")
warnings.simplefilter(action = 'ignore', category = FutureWarning)

# Dataset dimensions, kept for reference when reasoning about the fractional
# max_samples / max_features values below.
n_samples = X_train.shape[0]
n_features = X_train.shape[1]

# Search space narrowed around the optimum found by the coarse grid search.
params = dict(
    estimator = [DecisionTreeRegressor(random_state = 1)],
    n_estimators = range(800, 1101, 60),
    max_samples = [0.6, 0.75, 0.9],
    max_features = [0.5, 0.75, 0.85, 1.0],
    bootstrap = [True, False],
    bootstrap_features = [True, False],
)

# 2-fold cross validation keeps the 288-candidate sweep tractable.
cv = KFold(n_splits = 2, shuffle = True, random_state = 1)

# Exhaustive search over the narrowed space.
bagging_grid = GridSearchCV(BaggingRegressor(random_state = 1, n_jobs = -1),
                            param_grid = params,
                            scoring = "neg_mean_squared_error",
                            cv = cv,
                            n_jobs = -1,
                            verbose = 1)
# Fitting the bagging model
bagging_grid.fit(X_train, y_train.values.ravel())Fitting 2 folds for each of 288 candidates, totalling 576 fits
Best Score Through Grid Search : -0.407
Best Parameters : {'bootstrap': False, 'bootstrap_features': False, 'estimator': DecisionTreeRegressor(random_state=1), 'max_features': 0.75, 'max_samples': 0.75, 'n_estimators': 1040}
# Printing the best score and parameters
print('Best Score Through Grid Search : %.3f'%bagging_grid.best_score_)
print('Best Parameters : ', bagging_grid.best_params_)Best Score Through Grid Search : -0.407
Best Parameters : {'bootstrap': False, 'bootstrap_features': False, 'estimator': DecisionTreeRegressor(random_state=1), 'max_features': 0.75, 'max_samples': 0.75, 'n_estimators': 1040}
# Plotting results
cv_results = pd.DataFrame(bagging_grid.cv_results_)
fig, axes = plt.subplots(1,3,figsize=(14,5))
plt.subplots_adjust(wspace= 0.3)
axes[0].plot(cv_results.param_n_estimators, -cv_results.mean_test_score, 'o')
axes[0].set_xlabel('N_Estimators')
axes[0].set_ylabel('K-fold RMSE')
# BUG FIX: a missing comma previously plotted (param_max_samples - mean_test_score)
# against the candidate index; now the error is plotted against max_samples.
axes[1].plot(cv_results.param_max_samples, -cv_results.mean_test_score, 'o')
axes[1].set_xlabel('Max_Samples')
axes[1].set_ylabel('K-fold RMSE');
axes[2].plot(cv_results.param_max_features, -cv_results.mean_test_score, 'o')
axes[2].set_xlabel('Max_Features')
axes[2].set_ylabel('K-fold RMSE');# Defining the optimized base model
# Final bagging model refit with the best parameters from the fine grid search:
# 1040 trees, 75% of samples and 75% of features per tree, no bootstrapping.
bagging_opt = BaggingRegressor(
    estimator = DecisionTreeRegressor(random_state = 1),
    n_estimators = 1040,
    max_samples = 0.75,
    max_features = 0.75,
    bootstrap = False,
    bootstrap_features = False,
    n_jobs = -1,
    random_state = 1,
)
bagging_opt.fit(X_train, y_train)BaggingRegressor(bootstrap=False,
estimator=DecisionTreeRegressor(random_state=1),
max_features=0.75, max_samples=0.75, n_estimators=1040,
n_jobs=-1, random_state=1)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
BaggingRegressor(bootstrap=False,
estimator=DecisionTreeRegressor(random_state=1),
max_features=0.75, max_samples=0.75, n_estimators=1040,
n_jobs=-1, random_state=1)DecisionTreeRegressor(random_state=1)
DecisionTreeRegressor(random_state=1)
# Predict on the held-out set and round to integer quality scores.
rounded_pred_opt = bagging_opt.predict(X_test).round()
# Printing the test RMSE of the fine grid tuning model
print("Fine Grid Tuning Bagging Model Test RMSE:", np.sqrt(mean_squared_error(y_test, rounded_pred_opt)))Fine Grid Tuning Bagging Model Test RMSE: 0.665640235470275
3.6 Random Forest
By Amy Wang
Initial tuning of max_features, followed by manual experimentation with n_estimators.
# Coarse sweep over max_features (1 through 13) for a random forest,
# scored by 5-fold cross-validated RMSE.
params = {'max_features': range(1, 14)}
rfr_grid = GridSearchCV(RandomForestRegressor(random_state = 1),
                        params,
                        scoring = "neg_root_mean_squared_error",
                        n_jobs = -1,
                        verbose = 1)
rfr_grid.fit(X_train, y_train.quality)
rfr_grid.best_params_Fitting 5 folds for each of 13 candidates, totalling 65 fits
{'max_features': 3}
cv_results_rf = pd.DataFrame(rfr_grid.cv_results_)
plt.plot(cv_results_rf.param_max_features, -cv_results_rf.mean_test_score, 'o')
plt.xlabel('Features')
plt.ylabel('K-fold RMSE')Text(0, 0.5, 'K-fold RMSE')
# Refit the forest with the best max_features (3) and 900 trees, then report
# train/test RMSE of the integer-rounded predictions.
rfr_model_coarse = RandomForestRegressor(n_estimators= 900, max_features = 3, random_state = 1, n_jobs = -1).fit(X_train, y_train)
train_pred_round = rfr_model_coarse.predict(X_train).round()
test_pred_round = rfr_model_coarse.predict(X_test).round()
train_rmse = mean_squared_error(y_train, train_pred_round, squared = False)
test_rmse = mean_squared_error(y_test, test_pred_round, squared = False)
print("Training RMSE: {}".format(train_rmse))
print("Test RMSE: {}".format(test_rmse))Training RMSE: 0.19170803919425694
Test RMSE: 0.6725382459813659
rfr_model_coarse2 = RandomForestRegressor(n_estimators= 4000, max_features = 3, random_state = 1, n_jobs = -1).fit(X_train, y_train)
train_rmse = mean_squared_error(y_train, rfr_model_coarse2.predict(X_train).round(), squared = False)
test_rmse = mean_squared_error(y_test, rfr_model_coarse2.predict(X_test).round(), squared = False)
print("Training RMSE: {}".format(train_rmse))
print("Test RMSE: {}".format(test_rmse))Training RMSE: 0.18816227403499663
Test RMSE: 0.6592536572635639
rfr_model_coarse3 = RandomForestRegressor(n_estimators= 6000, max_features = 3, random_state = 1, n_jobs = -1).fit(X_train, y_train)
train_rmse = mean_squared_error(y_train, rfr_model_coarse3.predict(X_train).round(), squared = False)
test_rmse = mean_squared_error(y_test, rfr_model_coarse3.predict(X_test).round(), squared = False)
print("Training RMSE: {}".format(train_rmse))
print("Test RMSE: {}".format(test_rmse))Training RMSE: 0.18918213122865765
Test RMSE: 0.6604194471348085
The best model has the parameters: {n_estimators = 4000, max_features = 3}. Holding these parameters constant, I tune max_depth and max_samples.
# Randomized search holding n_estimators at the prior optimum.
params = {'n_estimators': [4000],
          'max_features': range(1, 14, 2),
          # NOTE(review): max_depth is fixed at 3 here, although the text above
          # says it is being tuned — trees this shallow underfit, which likely
          # explains the much worse RMSE reported below. Confirm [3] was intended.
          'max_depth': [3],
          # max_samples given as absolute counts, stepped by 500 up to train size
          'max_samples': range(1, X_train.shape[0], 500)}
rfr_grid_broad = RandomizedSearchCV(RandomForestRegressor(random_state = 1),
                    params,
                    n_jobs = -1,
                    scoring = "neg_root_mean_squared_error",
                    verbose = 1)
rfr_grid_broad.fit(X_train, y_train.quality)
rfr_grid_broad.best_params_
# {'n_estimators': 4000, 'max_samples': 501, 'max_features': 9, 'max_depth': 3}train_rmse = mean_squared_error(y_train, rfr_grid_broad.predict(X_train).round(), squared = False)
test_rmse = mean_squared_error(y_test, rfr_grid_broad.predict(X_test).round(), squared = False)
print("Training RMSE: {}".format(train_rmse))
print("Test RMSE: {}".format(test_rmse))Training RMSE: 0.7802645555206809
Test RMSE: 0.7975925314055079
Since the RMSE got significantly worse with this strategy, I retune all four features simultaneously.
# Re-tune all four hyperparameters simultaneously with a randomized search
# (default 10 candidates) over a broad joint space.
params = {'n_estimators': range(100, 5000, 100),
          'max_features': range(1, 14, 2),
          'max_depth': range(2,30, 2),
          'max_samples': range(1, X_train.shape[0], 500)}
rfr_grid_fine = RandomizedSearchCV(RandomForestRegressor(random_state = 1),
                    params,
                    n_jobs = -1,
                    scoring = "neg_root_mean_squared_error",
                    verbose = 1)
rfr_grid_fine.fit(X_train, y_train.quality)
rfr_grid_fine.best_params_
# Best parameters: {'n_estimators': 1700, 'max_samples': 4001, 'max_features': 7, 'max_depth': 20}cv_results_rf = pd.DataFrame(rfr_grid_fine.cv_results_)
fig, axes = plt.subplots(1, 4, figsize = (14,5))
plt.subplots_adjust(wspace = 0.2)
axes[0].plot(cv_results_rf.param_max_features, -cv_results_rf.mean_test_score, 'o')
axes[0].set_xlabel('Features')
axes[0].set_ylabel('K-fold RMSE')
axes[1].plot(cv_results_rf.param_max_depth, -cv_results_rf.mean_test_score, 'o')
axes[1].set_xlabel('Depth')
axes[1].set_ylabel('K-fold RMSE')
axes[2].plot(cv_results_rf.param_max_samples, -cv_results_rf.mean_test_score, 'o')
axes[2].set_xlabel('Samples')
axes[2].set_ylabel('K-fold RMSE')
axes[3].plot(cv_results_rf.param_n_estimators, -cv_results_rf.mean_test_score, 'o')
axes[3].set_xlabel('Estimators')
axes[3].set_ylabel('K-fold RMSE')Text(0, 0.5, 'K-fold RMSE')
# Manually adjusting based on trends in plots
# {'n_estimators': 1700, 'max_samples': 4001, 'max_features': 7, 'max_depth': 20} - Test RMSE 0.6821910402406465
# {'n_estimators': 3700, 'max_samples': 5000, 'max_features': 7, 'max_depth': 20} - Test RMSE 0.6713934992009014
# {'n_estimators': 5000, 'max_samples': 5000, 'max_features': 7, 'max_depth': 35} - Test RMSE 0.6702467972551374
rfr_model = RandomForestRegressor(n_estimators= 4000, max_features = 7, max_samples = 5000, max_depth = 50, random_state = 1, n_jobs = -1).fit(X_train, y_train)
train_rmse = mean_squared_error(y_train, rfr_model.predict(X_train).round(), squared = False)
test_rmse = mean_squared_error(y_test, rfr_model.predict(X_test).round(), squared = False)
print("Training RMSE: {}".format(train_rmse))
print("Test RMSE: {}".format(test_rmse))Training RMSE: 0.20386868290435214
Test RMSE: 0.6690981300917733
3.7 AdaBoost
By Kaitlyn Hung
Coarse grid search optimizing n_estimators, learning_rate, and max_depth of base_estimator:
# Coarse grid over ensemble size, learning rate, and base-tree depth for
# AdaBoost with decision-tree base learners, scored by 5-fold CV MSE.
model = AdaBoostRegressor(random_state=1, base_estimator = DecisionTreeRegressor(random_state = 1))
grid = {
    'n_estimators': [10, 50, 100, 200],
    'learning_rate': [0.0001, 0.001, 0.01, 0.1, 1.0],
    'base_estimator__max_depth': [3, 5, 10, 15],
}
cv = KFold(n_splits=5, shuffle=True, random_state=1)
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='neg_mean_squared_error').fit(X_train, y_train)
print("Best: %f using %s" % (grid_search.best_score_, grid_search.best_params_))Best: -0.380224 using {'base_estimator__max_depth': 15, 'learning_rate': 1.0, 'n_estimators': 200}
# Plotting results
cv_results = pd.DataFrame(grid_search.cv_results_)
fig, axes = plt.subplots(1,3,figsize=(14,5))
plt.subplots_adjust(wspace=0.2)
axes[0].plot(cv_results.param_n_estimators, -cv_results.mean_test_score, 'o')
axes[0].set_xlabel('n estimators')
axes[0].set_ylabel('K-fold RMSE')
axes[1].plot(cv_results.param_learning_rate, -cv_results.mean_test_score, 'o')
axes[1].set_xlabel('learning rate')
axes[1].set_ylabel('K-fold RMSE');
axes[2].plot(cv_results.param_base_estimator__max_depth, -cv_results.mean_test_score, 'o')
axes[2].set_xlabel('max_depth')
axes[2].set_ylabel('K-fold RMSE');Performing a finer grid search using the visualizations and optimal values
# Fine search
# Narrow the grid around the coarse-search optimum (n=200, lr=1.0, depth=15).
model = AdaBoostRegressor(random_state=1, base_estimator = DecisionTreeRegressor(random_state = 1))
grid = {
    'n_estimators': [200, 500, 1000],
    'learning_rate': [.5, .75, 1.0],
    'base_estimator__max_depth': [12, 14, 16],
}
cv = KFold(n_splits=5, shuffle=True, random_state=1)
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='neg_mean_squared_error').fit(X_train, y_train)
print("Best: %f using %s" % (grid_search.best_score_, grid_search.best_params_))Best: -0.377182 using {'base_estimator__max_depth': 14, 'learning_rate': 1.0, 'n_estimators': 1000}
# Plotting results
cv_results = pd.DataFrame(grid_search.cv_results_)
fig, axes = plt.subplots(1,3,figsize=(14,5))
plt.subplots_adjust(wspace=0.2)
axes[0].plot(cv_results.param_n_estimators, -cv_results.mean_test_score, 'o')
axes[0].set_xlabel('n estimators')
axes[0].set_ylabel('K-fold RMSE')
axes[1].plot(cv_results.param_learning_rate, -cv_results.mean_test_score, 'o')
axes[1].set_xlabel('learning rate')
axes[1].set_ylabel('K-fold RMSE');
axes[2].plot(cv_results.param_base_estimator__max_depth, -cv_results.mean_test_score, 'o')
axes[2].set_xlabel('max_depth')
axes[2].set_ylabel('K-fold RMSE');Performing another finer grid search as some of the optimal values are at the edge of the range I considered.
### Finer search
# Widen n_estimators and learning_rate upward, since the previous optimum sat
# on the boundary of the searched range.
model = AdaBoostRegressor(random_state=1, base_estimator = DecisionTreeRegressor(random_state = 1))
grid = dict()
grid['n_estimators'] = [800, 1000, 1200, 1500]
grid['learning_rate'] = [.75, 1.0, 1.5]
grid['base_estimator__max_depth'] = [13, 14,15, 16]
cv = KFold(n_splits=5, shuffle=True, random_state=1)
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='neg_mean_squared_error').fit(X_train, y_train)
print("Best: %f using %s" % (grid_search.best_score_, grid_search.best_params_))Best: -0.372938 using {'base_estimator__max_depth': 13, 'learning_rate': 1.5, 'n_estimators': 800}
# Plotting results
cv_results = pd.DataFrame(grid_search.cv_results_)
fig, axes = plt.subplots(1,3,figsize=(14,5))
plt.subplots_adjust(wspace=0.2)
axes[0].plot(cv_results.param_n_estimators, -cv_results.mean_test_score, 'o')
axes[0].set_xlabel('n estimators')
axes[0].set_ylabel('K-fold RMSE')
axes[1].plot(cv_results.param_learning_rate, -cv_results.mean_test_score, 'o')
axes[1].set_xlabel('learning rate')
axes[1].set_ylabel('K-fold RMSE');
axes[2].plot(cv_results.param_base_estimator__max_depth, -cv_results.mean_test_score, 'o')
axes[2].set_xlabel('max_depth')
axes[2].set_ylabel('K-fold RMSE');Optimizing just learning_rate and max_depth (not n_estimators)
# Finer search 2
# Hold n_estimators at 1500 and jointly refine learning_rate and tree depth.
model = AdaBoostRegressor(random_state=1, base_estimator = DecisionTreeRegressor(random_state = 1), n_estimators = 1500)
grid = dict()
grid['learning_rate'] = [1.25, 1.5, 2]
grid['base_estimator__max_depth'] = [12, 13, 14]
cv = KFold(n_splits=5, shuffle=True, random_state=1)
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='neg_mean_squared_error', verbose = True).fit(X_train, y_train)
print("Best: %f using %s" % (grid_search.best_score_, grid_search.best_params_))Fitting 5 folds for each of 9 candidates, totalling 45 fits
Best: -0.373120 using {'base_estimator__max_depth': 13, 'learning_rate': 1.5}
# Plotting results
cv_results = pd.DataFrame(grid_search.cv_results_)
fig, axes = plt.subplots(1,2,figsize=(14,5))
plt.subplots_adjust(wspace=0.2)
axes[0].plot(cv_results.param_learning_rate, -cv_results.mean_test_score, 'o')
axes[0].set_xlabel('learning rate')
axes[0].set_ylabel('K-fold RMSE');
axes[1].plot(cv_results.param_base_estimator__max_depth, -cv_results.mean_test_score, 'o')
axes[1].set_xlabel('max_depth')
axes[1].set_ylabel('K-fold RMSE');Optimizing n_estimators using other optimal hyperparameter values
# optimizing n estimators
# With depth (13) and learning rate (1.5) fixed at their optima, sweep the
# ensemble size alone.
model = AdaBoostRegressor(random_state=1, base_estimator = DecisionTreeRegressor(random_state = 1, max_depth = 13), learning_rate = 1.5)
grid = dict()
grid['n_estimators'] = [1000, 1500, 2000, 3000, 4000]
cv = KFold(n_splits=5, shuffle=True, random_state=1)
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='neg_mean_squared_error', verbose = True).fit(X_train, y_train)
print("Best: %f using %s" % (grid_search.best_score_, grid_search.best_params_))Fitting 5 folds for each of 5 candidates, totalling 25 fits
Best: -0.372992 using {'n_estimators': 2000}
# Plotting results
cv_results = pd.DataFrame(grid_search.cv_results_)
fig, axes = plt.subplots(1,1,figsize=(7,5))
plt.subplots_adjust(wspace=0.2)
axes.plot(cv_results.param_n_estimators, -cv_results.mean_test_score, 'o')
axes.set_xlabel('n_estimators')
axes.set_ylabel('K-fold MSE');Testing optimal model RMSE:
# tuned hyperparameters
# Final AdaBoost model with the CV-optimal settings (depth-13 trees,
# 2000 estimators, learning rate 1.5).
ada_model2000 = AdaBoostRegressor(base_estimator=DecisionTreeRegressor(max_depth=13, random_state=1),
                              n_estimators=2000,learning_rate=1.5,
                              random_state=1).fit(X_train,y_train)
# model from the second to last tuning (not optimal hyperparameters)
ada_model1500 = AdaBoostRegressor(base_estimator=DecisionTreeRegressor(max_depth=13, random_state=1),
                              n_estimators=1500,learning_rate=1.5,
                              random_state=1).fit(X_train,y_train)# predicting and rounding to int
# Predict with both models and round each prediction to an integer score.
# NOTE(review): round_half_up is defined elsewhere in the notebook —
# presumably it rounds .5 upward rather than numpy's round-half-to-even; verify.
raw_test_pred2000 = ada_model2000.predict(X_test)
test_pred2000 = np.array([round_half_up(xi) for xi in raw_test_pred2000])
raw_test_pred1500= ada_model1500.predict(X_test)
test_pred1500 = np.array([round_half_up(xi) for xi in raw_test_pred1500])
# RMSE
print("AdaBoost model Test RMSE (n=1500): ", np.sqrt(mean_squared_error(y_test, test_pred1500)))
print("AdaBoost model Test RMSE (n=2000): ", np.sqrt(mean_squared_error(y_test, test_pred2000)))AdaBoost model Test RMSE (n=1500): 0.6586699885725429
AdaBoost model Test RMSE (n=2000): 0.6667948594698258
3.8 Gradient Boosting
By Anastasia Wei
# define the range of n_estimators
def get_models():
    """Return a dict mapping tree-count labels to Huber-loss GBM regressors."""
    n_trees = [5, 10, 50, 100, 500, 1000, 1500, 2000, 2500]
    return {str(n): GradientBoostingRegressor(n_estimators=n, random_state=1, loss='huber')
            for n in n_trees}
# evaluate the model using cross-validation
def evaluate_model(model, X, y):
    """Return per-fold RMSE of `model` under shuffled 5-fold cross-validation."""
    folds = KFold(n_splits=5, shuffle=True, random_state=1)
    neg_mse = cross_val_score(model, X, y, scoring='neg_mean_squared_error', cv=folds, n_jobs=-1)
    # scores are negated MSE, so negate and take the square root to get RMSE
    return np.sqrt(-neg_mse)
# get the models to evaluate
models = get_models()
# Evaluate each ensemble size and collect the per-fold RMSE distributions.
results, names = list(), list()
for name, model in models.items():
    scores = evaluate_model(model, X_train, y_train)
    results.append(scores)
    names.append(name)
    print('>%s %.3f (%.3f)' % (name, np.mean(scores), np.std(scores)))
# plot model performance for comparison
plt.boxplot(results, labels= names, showmeans=True)
plt.ylabel('Cross validation error',fontsize=15)
plt.xlabel('Number of trees',fontsize=15);>5 0.793 (0.021)
>10 0.753 (0.020)
>50 0.694 (0.014)
>100 0.680 (0.013)
>500 0.662 (0.010)
>1000 0.660 (0.009)
>1500 0.660 (0.008)
>2000 0.662 (0.007)
>2500 0.662 (0.009)
# 4 fold cv coarse grid search based on previous search
start_time = time.time()
model = GradientBoostingRegressor(random_state = 1, loss='huber')
grid = dict()
grid['n_estimators'] = [1200, 1400, 1600, 1800]
grid['learning_rate'] = [0.1, 0.2, 0.3]
grid['max_depth'] = [8, 10, 12, 14]
grid['subsample'] = [0.4, 0.6, 0.8, 1]
# 4-fold CV with 50 random candidates keeps the runtime manageable
cv = KFold(n_splits = 4, shuffle=True, random_state=1)
grid_search = RandomizedSearchCV(estimator = model,
                                 param_distributions = grid,
                                 n_jobs = -1, cv = cv,
                                 n_iter = 50,
                                 scoring = 'neg_mean_squared_error',
                                 verbose = True)
grid_result = grid_search.fit(X_train, y_train)
# best_score_ is negated MSE, so sqrt(-score) is the CV RMSE
print("Best: %f using %s" % (np.sqrt(-grid_result.best_score_), grid_result.best_params_))
print("Time taken = ",(time.time()-start_time)/60," minutes")Fitting 4 folds for each of 50 candidates, totalling 200 fits
Best: 0.623735 using {'subsample': 0.8, 'n_estimators': 1200, 'max_depth': 8, 'learning_rate': 0.1}
Time taken = 24.304628153642017 minutes
# visualize cv results
cv_results = pd.DataFrame(grid_result.cv_results_)
fig, axes = plt.subplots(1, 4, figsize = (16, 4))
plt.subplots_adjust(wspace = 0.3)
axes[0].plot(cv_results.param_n_estimators, np.sqrt(-cv_results.mean_test_score), 'o')
axes[0].set_xlabel('param_n_estimators')
axes[0].set_ylabel('K-fold RMSE')
axes[1].plot(cv_results.param_learning_rate, np.sqrt(-cv_results.mean_test_score), 'o')
axes[1].set_xlabel('Learning Rate')
axes[1].set_ylabel('K-fold RMSE')
axes[2].plot(cv_results.param_subsample, np.sqrt(-cv_results.mean_test_score), 'o')
axes[2].set_xlabel('Subsample')
axes[2].set_ylabel('K-fold RMSE');
axes[3].plot(cv_results.param_max_depth, np.sqrt(-cv_results.mean_test_score), 'o')
axes[3].set_xlabel('max_depth')
axes[3].set_ylabel('K-fold RMSE');# computing train and test rmse based on initial optimization
model = GradientBoostingRegressor(n_estimators = 1200, loss = 'huber', max_depth = 8,
learning_rate = 0.1, subsample = 0.8,
random_state = 45).fit(X_train,y_train)
pred = model.predict(X_test)
print('The test RMSE is', np.sqrt(mean_squared_error(y_test, np.around(pred))))The test RMSE is 0.6782329983125268
# computing train and test rmse based on initial optimization
model = GradientBoostingRegressor(n_estimators = 1400, loss = 'huber', max_depth = 8,
learning_rate = 0.1, subsample = 0.8,
random_state = 45).fit(X_train,y_train)
pred = model.predict(X_test)
print('The test RMSE is', np.sqrt(mean_squared_error(y_test, np.around(pred))))The test RMSE is 0.6776656765919086
# finer grid search based on previous optimization
start_time = time.time()
model = GradientBoostingRegressor(random_state = 1, loss='huber')
grid = dict()
grid['n_estimators'] = [1300, 1350, 1400, 1450, 1500]
# NOTE(review): 0.8 looks like a typo for 0.08 in a "finer" sweep around the
# previous optimum of 0.1 — confirm the intended value.
grid['learning_rate'] = [0.8, 0.1, 0.15]
grid['max_depth'] = [8, 9, 10]
grid['subsample'] = [0.7, 0.8, 0.9, 1]
cv = KFold(n_splits = 4, shuffle=True, random_state=1)
grid_search = RandomizedSearchCV(estimator = model,
                                 param_distributions = grid,
                                 n_jobs = -1, cv = cv,
                                 n_iter = 50,
                                 scoring = 'neg_mean_squared_error',
                                 verbose = True)
grid_result = grid_search.fit(X_train, y_train)
print("Best: %f using %s" % (np.sqrt(-grid_result.best_score_), grid_result.best_params_))
print("Time taken = ",(time.time()-start_time)/60," minutes")Fitting 4 folds for each of 50 candidates, totalling 200 fits
Best: 0.620413 using {'subsample': 0.8, 'n_estimators': 1300, 'max_depth': 9, 'learning_rate': 0.1}
Time taken = 18.446675260861713 minutes
# visualize finer grid search results
cv_results = pd.DataFrame(grid_result.cv_results_)
fig, axes = plt.subplots(1, 4, figsize = (16, 4))
plt.subplots_adjust(wspace = 0.3)
axes[0].plot(cv_results.param_n_estimators, np.sqrt(-cv_results.mean_test_score), 'o')
axes[0].set_xlabel('param_n_estimators')
axes[0].set_ylabel('K-fold RMSE')
axes[1].plot(cv_results.param_learning_rate, np.sqrt(-cv_results.mean_test_score), 'o')
axes[1].set_xlabel('Learning Rate')
axes[1].set_ylabel('K-fold RMSE')
axes[2].plot(cv_results.param_subsample, np.sqrt(-cv_results.mean_test_score), 'o')
axes[2].set_xlabel('Subsample')
axes[2].set_ylabel('K-fold RMSE');
axes[3].plot(cv_results.param_max_depth, np.sqrt(-cv_results.mean_test_score), 'o')
axes[3].set_xlabel('max_depth')
axes[3].set_ylabel('K-fold RMSE');# model with optimized parameter
model = GradientBoostingRegressor(n_estimators = 1300, loss = 'huber', max_depth = 9,
learning_rate = 0.1, subsample = 0.8,
random_state = 1).fit(X_train, y_train)
pred = model.predict(X_test)
# rounded rmse
print('The test RMSE is', np.sqrt(mean_squared_error(y_test, np.around(pred))))The test RMSE is 0.6713934992009014
Manual Tuning & Testing
# best model
model = GradientBoostingRegressor(n_estimators = 1300, loss = 'huber', max_depth = 9,
learning_rate = 0.08, subsample = 0.85,
random_state = 1).fit(X_train, y_train)
pred = model.predict(X_test)
# rounded rmse
print('The test RMSE is', np.sqrt(mean_squared_error(y_test, np.around(pred))))The test RMSE is 0.6586699885725429
# feature importance
features_df = pd.DataFrame(zip(X_train.columns, list(model.feature_importances_)),
columns =['Predictor', 'Importance'])
features_df.sort_values(by = 'Importance', ascending = False)| Predictor | Importance | |
|---|---|---|
| 10 | alcohol | 0.250846 |
| 1 | volatile acidity | 0.124307 |
| 5 | free sulfur dioxide | 0.080374 |
| 9 | sulphates | 0.078176 |
| 6 | total sulfur dioxide | 0.077565 |
| 3 | residual sugar | 0.072411 |
| 8 | pH | 0.069419 |
| 4 | chlorides | 0.067845 |
| 7 | density | 0.066263 |
| 2 | citric acid | 0.058864 |
| 0 | fixed acidity | 0.053085 |
| 12 | type_white | 0.000499 |
| 11 | type_red | 0.000347 |
Other attempts
model = GradientBoostingRegressor(n_estimators = 1300, loss = 'huber', max_depth = 7,
learning_rate = 0.08, subsample = 0.85,
random_state = 1).fit(X_train, y_train)
pred = model.predict(X_test)
# rounded rmse
print('The test RMSE is', np.sqrt(mean_squared_error(y_test, np.around(pred))))The test RMSE is 0.6713934992009014
model = GradientBoostingRegressor(n_estimators = 1400, loss = 'huber', max_depth = 9,
learning_rate = 0.07, subsample = 0.8,
random_state = 1).fit(X_train, y_train)
pred = model.predict(X_test)
# rounded rmse
print('The test RMSE is', np.sqrt(mean_squared_error(y_test, np.around(pred))))The test RMSE is 0.6719661163618754
3.9 XGBoost
By Amy Wang
Initial broad sweep of the parameter space, for 7 parameters: max_depth, learning_rate, reg_lambda, n_estimators, gamma, subsample, colsample_bytree. I fit the model with the optimal parameters found by RandomizedSearchCV and plot how well the model predicts both train and test data.
## Initial Broad Sweep of the Parameter Space
# Randomized search (20 candidates, 5-fold CV) over seven XGBoost
# hyperparameters. No scoring argument is given, so the estimator's default
# score is used rather than neg MSE as in the other searches.
param_grid = {'max_depth': [4, 6, 8],
              'learning_rate': [0.0001, 0.001, 0.01, 0.1],
              'reg_lambda':[0, 10, 100],
              'n_estimators':[1000, 2000, 3000, 4000],
              'gamma': [0, 5, 10],
              'subsample': [0.3, 0.5, 1.0],
              "colsample_bytree": [0.5, 0.75, 1.0]}
xgboost = RandomizedSearchCV(estimator = XGBRegressor(random_state=1),
                             param_distributions = param_grid,
                             verbose = 1,
                             n_jobs = -1,
                             n_iter = 20,
                             cv = 5).fit(X_train, y_train)
xgboost.best_params_
# {'subsample': 1.0,
# 'reg_lambda': 0,
# 'n_estimators': 2000,
# 'max_depth': 8,
# 'learning_rate': 0.01,
# 'gamma': 0,
# 'colsample_bytree': 1.0}Fitting 5 folds for each of 20 candidates, totalling 100 fits
{'subsample': 1.0,
'reg_lambda': 0,
'n_estimators': 2000,
'max_depth': 8,
'learning_rate': 0.01,
'gamma': 0,
'colsample_bytree': 1.0}
# Refit the best configuration from the broad sweep, then round test
# predictions to integer quality scores for RMSE evaluation.
init_xgboost = XGBRegressor(n_estimators = 2000, learning_rate = 0.01, subsample = 1.0, reg_lambda = 0, max_depth = 8, gamma = 0, colsample_bytree = 1.0, random_state = 1)
init_xgboost.fit(X_train, y_train)
pred_series = pd.Series(init_xgboost.predict(X_test))
pred_series_rounded = pred_series.round()
mean_squared_error(y_test, pred_series_rounded, squared = False)0.6799321233091523
sns.regplot(x = init_xgboost.predict(X_train).round(), y = y_train)<AxesSubplot:ylabel='quality'>
sns.regplot(x = init_xgboost.predict(X_test).round(), y = y_test)<AxesSubplot:ylabel='quality'>
I visualize the 5-fold cross-validated RMSE for each of the parameters to understand if it’s possible to manually optimize the model any further.
# Visualizing the parameter space to understand how to narrow subsequent searches
cv_results = pd.DataFrame(xgboost.cv_results_)
fig, axes = plt.subplots(1, 3, figsize = (20,5))
plt.subplots_adjust(wspace = 0.2)
axes[0].plot(cv_results.param_subsample, -cv_results.mean_test_score, 'o')
axes[0].set_xlabel('Subsample')
axes[0].set_ylabel('K-fold RMSE')
axes[1].plot(cv_results.param_gamma, -cv_results.mean_test_score, 'o')
axes[1].set_xlabel('Gamma')
axes[1].set_ylabel('K-fold RMSE')
axes[2].plot(cv_results.param_learning_rate, -cv_results.mean_test_score, 'o')
axes[2].set_xlabel('Learning Rate')
axes[2].set_ylabel('K-fold RMSE')Text(0, 0.5, 'K-fold RMSE')
fig, axes = plt.subplots(1, 4, figsize = (20,5))
plt.subplots_adjust(wspace = 0.2)
axes[0].plot(cv_results.param_max_depth, -cv_results.mean_test_score, 'o')
axes[0].set_xlabel('Depth')
axes[0].set_ylabel('K-fold RMSE')
axes[1].plot(cv_results.param_n_estimators, -cv_results.mean_test_score, 'o')
axes[1].set_xlabel('Estimators')
axes[1].set_ylabel('K-fold RMSE')
axes[2].plot(cv_results.param_reg_lambda, -cv_results.mean_test_score, 'o')
axes[2].set_xlabel('Lambda')
axes[2].set_ylabel('K-fold RMSE')
axes[3].plot(cv_results.param_colsample_bytree, -cv_results.mean_test_score, 'o')
axes[3].set_xlabel('colsample_bytree')
axes[3].set_ylabel('K-fold RMSE')Text(0, 0.5, 'K-fold RMSE')
# Manual tuning based on the plots. RMSE does not improve.
init_xgboost = XGBRegressor(n_estimators = 4000, learning_rate = 0.01, subsample = 1.0, reg_lambda = 100, max_depth = 8, gamma = 0, colsample_bytree = 1.0, random_state = 1)
init_xgboost.fit(X_train, y_train)
pred_series = pd.Series(init_xgboost.predict(X_test))
pred_series_rounded = pred_series.round()
mean_squared_error(y_test, pred_series_rounded, squared = False)0.6855654600401044
Second RandomizedSearchCV grid search with a constant learning rate of 0.01.
# Grid search with learning_rate = 0.01
param_grid = {'max_depth': [4, 5, 6, 7, 8, 9],
'learning_rate': [0.01],
'reg_lambda':[0, 10, 20, 30, 40, 70, 100],
'n_estimators':[3000, 4000, 5000],
'gamma': [0, 3, 5, 7, 10],
'subsample': [0.5, 1.0],
"colsample_bytree": [0.6, 0.75, 0.8, 0.9]}
xgboost2 = RandomizedSearchCV(estimator = XGBRegressor(random_state=1),
param_distributions = param_grid,
verbose = 1,
n_jobs = -1,
n_iter = 50,
cv = 5).fit(X_train, y_train)
xgboost2.best_params_
# {'subsample': 0.5,
# 'reg_lambda': 0,
# 'n_estimators': 5000,
# 'max_depth': 9,
# 'learning_rate': 0.01,
# 'gamma': 0,
# 'colsample_bytree': 0.75}Fitting 5 folds for each of 50 candidates, totalling 250 fits
{'subsample': 0.5,
'reg_lambda': 0,
'n_estimators': 5000,
'max_depth': 9,
'learning_rate': 0.01,
'gamma': 0,
'colsample_bytree': 0.75}
cv_results = pd.DataFrame(xgboost2.cv_results_)
fig, axes = plt.subplots(1, 3, figsize = (20,5))
plt.subplots_adjust(wspace = 0.2)
axes[0].plot(cv_results.param_subsample, -cv_results.mean_test_score, 'o')
axes[0].set_xlabel('Subsample')
axes[0].set_ylabel('K-fold RMSE')
axes[1].plot(cv_results.param_gamma, -cv_results.mean_test_score, 'o')
axes[1].set_xlabel('Gamma')
axes[1].set_ylabel('K-fold RMSE')
axes[2].plot(cv_results.param_colsample_bytree, -cv_results.mean_test_score, 'o')
axes[2].set_xlabel('colsample_bytree')
axes[2].set_ylabel('K-fold RMSE')Text(0, 0.5, 'K-fold RMSE')
fig, axes = plt.subplots(1, 3, figsize = (20,5))
plt.subplots_adjust(wspace = 0.2)
axes[0].plot(cv_results.param_max_depth, -cv_results.mean_test_score, 'o')
axes[0].set_xlabel('Depth')
axes[0].set_ylabel('K-fold RMSE')
axes[1].plot(cv_results.param_n_estimators, -cv_results.mean_test_score, 'o')
axes[1].set_xlabel('Estimators')
axes[1].set_ylabel('K-fold RMSE')
axes[2].plot(cv_results.param_reg_lambda, -cv_results.mean_test_score, 'o')
axes[2].set_xlabel('Lambda')
axes[2].set_ylabel('K-fold RMSE')Text(0, 0.5, 'K-fold RMSE')
# Fit based on the best combination of parameters from above
# {'subsample': 0.5,
# 'reg_lambda': 0,
# 'n_estimators': 5000,
# 'max_depth': 9,
# 'learning_rate': 0.01,
# 'gamma': 0,
# 'colsample_bytree': 0.75}
optimized_xgboost2 = XGBRegressor(n_estimators = 5000, learning_rate = 0.01, subsample = 0.5, reg_lambda = 0, max_depth = 9, gamma = 0, colsample_bytree = 0.75, random_state = 1)
optimized_xgboost2.fit(X_train, y_train)
mean_squared_error(y_test, optimized_xgboost2.predict(X_test).round(), squared = False)0.6598368096617643
Plotting the Actual vs. Predicted Values of Quality
regplot = sns.regplot(x = optimized_xgboost2.predict(X_train).round(), y = y_train)
regplot.set(xlabel ="Predicted Values", ylabel = "Actual Values", title ='Actual vs. Predicted Values of Quality (Training)')[Text(0.5, 0, 'Predicted Values'),
Text(0, 0.5, 'Actual Values'),
Text(0.5, 1.0, 'Actual vs. Predicted Values of Quality (Training)')]
regplot = sns.regplot(x = optimized_xgboost2.predict(X_test).round(), y = y_test)
regplot.set(xlabel ="Predicted Values", ylabel = "Actual Values", title ='Actual vs. Predicted Values of Quality (Test)')[Text(0.5, 0, 'Predicted Values'),
Text(0, 0.5, 'Actual Values'),
Text(0.5, 1.0, 'Actual vs. Predicted Values of Quality (Test)')]
# Per-feature gain importance from the fitted booster, as a two-column frame.
features_dict = optimized_xgboost2.get_booster().get_score(importance_type='gain')
features = pd.DataFrame(list(features_dict.items()), columns =['Feature_name', 'Importance'])
features.sort_values("Importance", ascending = False).reset_index().drop(columns = "index")| Feature_name | Importance | |
|---|---|---|
| 0 | type_red | 0.765248 |
| 1 | type_white | 0.414878 |
| 2 | alcohol | 0.392954 |
| 3 | sulphates | 0.171820 |
| 4 | density | 0.168505 |
| 5 | free sulfur dioxide | 0.166862 |
| 6 | pH | 0.154604 |
| 7 | total sulfur dioxide | 0.152681 |
| 8 | chlorides | 0.129691 |
| 9 | residual sugar | 0.127429 |
| 10 | citric acid | 0.108015 |
| 11 | volatile acidity | 0.107656 |
| 12 | fixed acidity | 0.055337 |
Attempting Bayesian Optimization, to compare to RandomizedSearchCV
# Bayesian Optimization of XGBoost Hyperparameters
## To see if Bayesian Optimization will do better than Random Grid Search CV
# Continuous search bounds; integer-valued parameters are cast to int inside
# the objective function.
# NOTE(review): the printed results table below includes a 'colsample' column
# that is absent from these bounds — the output likely came from an earlier run
# with a different pbounds; re-run to confirm.
pbounds = {
    'learning_rate': (0.001, 1.0),
    'n_estimators': (100, 4000),
    'max_depth': (3,10),
    'subsample': (0.3, 1.0),
    'gamma': (0, 10),
    'reg_lambda': (0, 100)}
def xgboost_hyper_param(learning_rate,
                        n_estimators,
                        max_depth,
                        subsample,
                        gamma,
                        reg_lambda):
    """Objective for Bayesian optimization of XGBoost hyperparameters.

    Builds an XGBRegressor from the sampled values and returns the mean
    5-fold cross-validated score on the training set (negative MSE, so
    larger is better — bayes_opt maximizes).

    BUG FIX: the original constructed the regressor WITHOUT `subsample`
    and `reg_lambda`, so the optimizer explored those two dimensions of
    `pbounds` with no effect on the model. Both are now passed through.
    """
    # bayes_opt samples continuous values; these two parameters must be ints.
    max_depth = int(max_depth)
    n_estimators = int(n_estimators)
    clf = XGBRegressor(
        max_depth = max_depth,
        learning_rate = learning_rate,
        n_estimators = n_estimators,
        subsample = subsample,
        reg_lambda = reg_lambda,
        gamma = gamma)
    return np.mean(cross_val_score(clf, X_train, y_train, cv = 5, scoring='neg_mean_squared_error'))
# Run the Bayesian optimizer over the hyperparameter bounds: 5 random
# initial probes followed by 10 guided iterations, then report the best
# parameter set found and its objective value.
optimizer = BayesianOptimization(
    f = xgboost_hyper_param,
    pbounds = pbounds,
    random_state = 1,
)
optimizer.maximize(init_points = 5, n_iter = 10)
print("Best result: {}; f(x) = {}.".format(optimizer.max["params"], optimizer.max["target"]))
# Best result: {'colsample': 0.926371203066647, 'gamma': 4.004543074048512, 'learning_rate': 0.105068050089807, 'max_depth': 6.815443879731223, 'n_estimators': 1315.404330702919, 'subsample': 0.4879450393211324}; f(x) = -0.466861276075143.| iter | target | colsample | gamma | learni... | max_depth | n_esti... | subsample |
-------------------------------------------------------------------------------------------------
| 1 | -0.4945 | 0.7085 | 7.203 | 0.01011 | 5.116 | 525.6 | 0.3646 |
| 2 | -0.4759 | 0.5931 | 3.456 | 0.4028 | 6.772 | 1.316e+03 | 0.7797 |
| 3 | -0.4964 | 0.6022 | 8.781 | 0.03711 | 7.693 | 1.31e+03 | 0.6911 |
| 4 | -0.5484 | 0.5702 | 1.981 | 0.8027 | 9.778 | 1.009e+03 | 0.7846 |
| 5 | -0.5041 | 0.9382 | 8.946 | 0.09419 | 3.273 | 592.5 | 0.9147 |
| 6 | -0.4669 | 0.9264 | 4.005 | 0.1051 | 6.815 | 1.315e+03 | 0.4879 |
| 7 | -0.499 | 0.8711 | 5.935 | 0.5099 | 3.894 | 1.314e+03 | 0.3997 |
| 8 | -0.4822 | 0.8882 | 6.403 | 0.4975 | 8.614 | 1.316e+03 | 0.6406 |
| 9 | -0.5233 | 0.7391 | 3.505 | 0.8952 | 7.737 | 1.312e+03 | 0.3287 |
| 10 | -0.4892 | 0.6288 | 3.743 | 0.6291 | 6.274 | 1.319e+03 | 0.5374 |
| 11 | -0.5191 | 0.9329 | 5.617 | 0.8387 | 7.073 | 1.316e+03 | 0.6822 |
| 12 | -0.4856 | 0.5393 | 3.801 | 0.5747 | 5.933 | 1.315e+03 | 0.3307 |
| 13 | -0.5015 | 0.5776 | 2.517 | 0.631 | 6.298 | 1.317e+03 | 0.9844 |
| 14 | -0.4886 | 0.5334 | 4.53 | 0.486 | 8.605 | 1.315e+03 | 0.6784 |
| 15 | -0.5222 | 0.5267 | 4.66 | 0.9183 | 9.579 | 1.317e+03 | 0.8874 |
=================================================================================================
Best result: {'colsample': 0.926371203066647, 'gamma': 4.004543074048512, 'learning_rate': 0.105068050089807, 'max_depth': 6.815443879731223, 'n_estimators': 1315.404330702919, 'subsample': 0.4879450393211324}; f(x) = -0.466861276075143.
# Implementing the model with the Bayesian-optimized hyperparameters.
# BUG FIX: the original passed `colsample = ...`, which is not a valid
# XGBoost parameter — the captured warning confirmed it:
#   Parameters: { "colsample" } are not used.
# The intended parameter is `colsample_bytree`, used here instead.
xgb_model_bayesOpt = XGBRegressor(colsample_bytree = 0.926371203066647,
                                  subsample = 0.4879450393211324,
                                  n_estimators = 1315,
                                  max_depth = 7,
                                  learning_rate = 0.105068050089807,
                                  gamma = 4.004543074048512).fit(X_train, y_train)
# Test RMSE of the rounded predictions.
# (previous run, with colsample silently ignored: 0.739022223044643)
mean_squared_error(y_test, xgb_model_bayesOpt.predict(X_test).round(), squared = False)
3.10 CatBoost and LightGBM
By Anastasia Wei
# Baseline CatBoost regressor, all hyperparameters at their defaults.
model_cat = CatBoostRegressor(verbose = False).fit(X_train, y_train)
cat_rmse_train = np.sqrt(mean_squared_error(np.around(model_cat.predict(X_train)),y_train))
cat_rmse_test = np.sqrt(mean_squared_error(np.around(model_cat.predict(X_test)),y_test))
print('Train RMSE =', cat_rmse_train)
print('Test RMSE =', cat_rmse_test)
# output: Train RMSE = 0.4760979229626765
# output: Test RMSE = 0.7130648907789097
# Baseline LightGBM regressor, all hyperparameters at their defaults.
model_lgbm = LGBMRegressor().fit(X_train, y_train)
lgbm_rmse_train = np.sqrt(mean_squared_error(np.around(model_lgbm.predict(X_train)),y_train))
lgbm_rmse_test = np.sqrt(mean_squared_error(np.around(model_lgbm.predict(X_test)),y_test))
print('Train RMSE =', lgbm_rmse_train)
print('Test RMSE =', lgbm_rmse_test)
# output: Train RMSE = 0.5210592961182009
# output: Test RMSE = 0.7258946734362203
4 Model Ensemble
By Anastasia Wei
Put code with comments. The comments should explain the code such that it can be easily understood. You may put text (in a markdown cell) before a large chunk of code to explain the overall purpose of the code, if it is not intuitive. Put the name of the person / persons who contributed to each code chunk / set of code chunks.
Loading each of the individual models from the previous model section and generating their train/test predictions
# Lasso base model at a fixed alpha (presumably from the earlier CV tuning
# — see the model section). Predictions are kept for the ensemble stage.
model_lasso = Lasso(alpha = 0.00036584035717135963).fit(X_train, y_train)
lasso_pred_train = model_lasso.predict(X_train)
lasso_pred_test = model_lasso.predict(X_test)
lasso_rmse_train = np.sqrt(mean_squared_error(y_train, np.around(lasso_pred_train)))
lasso_rmse_test = np.sqrt(mean_squared_error(y_test, np.around(lasso_pred_test)))
print("lasso train RMSE =", lasso_rmse_train)
print("lasso test RMSE =", lasso_rmse_test)
# output: lasso train RMSE = 0.7818658578951756
# output: lasso test RMSE = 0.808131748588634
# Ridge base model at a fixed alpha (presumably from earlier CV tuning).
model_ridge = Ridge(alpha = 0.01636987568069109).fit(X_train, y_train)
ridge_pred_train = model_ridge.predict(X_train)
ridge_pred_test = model_ridge.predict(X_test)
ridge_rmse_train = np.sqrt(mean_squared_error(y_train, np.around(ridge_pred_train)))
ridge_rmse_test = np.sqrt(mean_squared_error(y_test, np.around(ridge_pred_test)))
print("ridge train RMSE =", ridge_rmse_train)
print("ridge test RMSE =", ridge_rmse_test)
# output: ridge train RMSE = 0.7892143827427598
# output: ridge test RMSE = 0.8166535844059444
# Tuned gradient-boosting base model (Huber loss).
# NOTE(review): train RMSE of 0.024 vs test RMSE of 0.659 indicates heavy
# in-sample overfitting; relevant later when stacking on train predictions.
model_gb = GradientBoostingRegressor(n_estimators = 1300, loss = 'huber',
                                     max_depth = 9, learning_rate = 0.08,
                                     subsample = 0.85, random_state = 1)
model_gb.fit(X_train, y_train)
gb_pred_train = model_gb.predict(X_train)
gb_pred_test = model_gb.predict(X_test)
print("Gradient boost train RMSE =", np.sqrt(mean_squared_error(np.around(gb_pred_train), y_train)))
print("Gradient boost test RMSE =", np.sqrt(mean_squared_error(np.around(gb_pred_test), y_test)))
# output: Gradient boost train RMSE = 0.02402615469220623
# output: Gradient boost test RMSE = 0.6586699885725429
# Tuned XGBoost base model for the ensemble stage.
model_xgb = XGBRegressor(n_estimators = 4050, max_depth = 7,
                         learning_rate = 0.01, subsample = 0.5,
                         reg_lambda = 10, gamma = 0,
                         colsample_bytree = 0.6, random_state = 1)
model_xgb.fit(X_train, y_train)
xgb_pred_train = model_xgb.predict(X_train)
xgb_pred_test = model_xgb.predict(X_test)
print("xgb train RMSE =", np.sqrt(mean_squared_error(np.around(xgb_pred_train), y_train)))
print("xgb test RMSE =", np.sqrt(mean_squared_error(np.around(xgb_pred_test), y_test)))
# output: xgb train RMSE = 0.16587909616044325
# output: xgb test RMSE = 0.6753916242846414
# Tuned AdaBoost base model over depth-13 decision trees.
# NOTE(review): train RMSE = 0.0 means the training set is memorized exactly.
model_ada = AdaBoostRegressor(estimator = DecisionTreeRegressor(max_depth=13),
                              n_estimators = 1500, learning_rate = 1.5,
                              random_state = 45)
model_ada.fit(X_train,y_train)
ada_pred_train = model_ada.predict(X_train)
ada_pred_test = model_ada.predict(X_test)
print("Adaboost train RMSE =", np.sqrt(mean_squared_error(np.around(ada_pred_train), y_train)))
print("Adaboost test RMSE =", np.sqrt(mean_squared_error(np.around(ada_pred_test), y_test)))
# output: Adaboost train RMSE = 0.0
# output: Adaboost test RMSE = 0.6604194471348085
# Tuned random forest base model (3000 trees, subsampled rows/features).
model_rf = RandomForestRegressor(n_estimators= 3000, max_features = 7,
                                 max_samples = 5000, max_depth = 50,
                                 random_state = 1, n_jobs = -1)
model_rf.fit(X_train, y_train)
rf_pred_train = model_rf.predict(X_train)
rf_pred_test = model_rf.predict(X_test)
rf_rmse_train = np.sqrt(mean_squared_error(np.around(rf_pred_train), y_train))
rf_rmse_test = np.sqrt(mean_squared_error(np.around(rf_pred_test), y_test))
print("Random forest train RMSE =", rf_rmse_train)
print("Random forest test RMSE =", rf_rmse_test)
# output: Random forest train RMSE = 0.2029226514289605
# output: Random forest test RMSE = 0.6690981300917733
# Tuned single decision tree base model (shallow, leaf-limited).
model_dt = DecisionTreeRegressor(max_leaf_nodes = 52, max_depth = 6,
                                 min_samples_leaf = 8, random_state = 45)
model_dt.fit(X_train, y_train)
dt_pred_train = model_dt.predict(X_train)
dt_pred_test = model_dt.predict(X_test)
dt_rmse_train = np.sqrt(mean_squared_error(np.around(dt_pred_train), y_train))
dt_rmse_test = np.sqrt(mean_squared_error(np.around(dt_pred_test), y_test))
print("Decision tree train RMSE =", dt_rmse_train)
print("Decision tree test RMSE =", dt_rmse_test)
# output: Decision tree train RMSE = 0.7266339665201115
# output: Decision tree test RMSE = 0.8043152845265822
# Default CatBoost base model (same default fit as in section 3.10,
# repeated here so the predictions are available for the ensemble stage).
model_cat = CatBoostRegressor(verbose = False).fit(X_train, y_train)
cat_pred_train = model_cat.predict(X_train)
cat_pred_test = model_cat.predict(X_test)
print("Catboost train RMSE =", np.sqrt(mean_squared_error(np.around(cat_pred_train), y_train)))
print("Catboost test RMSE =", np.sqrt(mean_squared_error(np.around(cat_pred_test), y_test)))
# output: Catboost train RMSE = 0.4760979229626765
# output: Catboost test RMSE = 0.7130648907789097
# Default LightGBM base model, refit so its predictions feed the ensemble.
model_lgbm = LGBMRegressor().fit(X_train, y_train)
lgbm_pred_train = model_lgbm.predict(X_train)
lgbm_pred_test = model_lgbm.predict(X_test)
print("lgbm train RMSE =", np.sqrt(mean_squared_error(np.around(lgbm_pred_train), y_train)))
print("lgbm test RMSE =", np.sqrt(mean_squared_error(np.around(lgbm_pred_test), y_test)))
# output: lgbm train RMSE = 0.5210592961182009
# output: lgbm test RMSE = 0.7258946734362203
# Tuned bagging base model: 1100 trees, each trained on 75% of the rows
# drawn WITHOUT replacement (bootstrap = False) but with features drawn
# WITH replacement (bootstrap_features = True).
model_bag = BaggingRegressor(random_state = 1, n_jobs = -1,
                             estimator = DecisionTreeRegressor(random_state = 1),
                             n_estimators = 1100, max_samples = 0.75,
                             max_features = 1.0, bootstrap_features = True,
                             bootstrap = False)
model_bag.fit(X_train, y_train)
bag_pred_train = model_bag.predict(X_train)
bag_pred_test = model_bag.predict(X_test)
print("bagging train RMSE =", np.sqrt(mean_squared_error(y_train, np.around(bag_pred_train))))
print("bagging test RMSE =", np.sqrt(mean_squared_error(y_test, np.around(bag_pred_test))))
# output: bagging train RMSE = 0.08773111263353295
# output: bagging test RMSE = 0.6673714223613529
# MARS model with a second-stage residual correction.
#
# BUG FIX (data leakage): the original first fit `residual_model` on the
# TEST residuals (y_test - rounded test predictions) and used that fit to
# correct the test predictions, only afterwards refitting on the training
# residuals. Using test labels to build the test-set correction leaks the
# answer into `mars_pred_test`. Here the residual model is fit on TRAINING
# residuals only, then applied to both sets. The no-op
# .rename(columns={'quality': 'quality'}) is also removed.
mars_model = Earth(max_terms = 800, max_degree = 7)
mars_model.fit(X_train, y_train)

# Base (rounded) predictions for both sets; quality is an integer score.
pred_opt_train = mars_model.predict(X_train)
rounded_opt_train = pd.DataFrame(np.around(pred_opt_train), columns = ['quality'])
pred_opt = mars_model.predict(X_test)
rounded_pred_opt = pd.DataFrame(np.around(pred_opt), columns = ['quality'])

# Fit the residual model on training residuals only.
residuals_train = y_train.quality - rounded_opt_train.quality.values
residual_model = Earth()
residual_model.fit(rounded_opt_train, residuals_train)

# Apply the train-fitted residual correction to both sets.
residuals_pred_train = np.around(residual_model.predict(rounded_opt_train))
residuals_pred_series_train = pd.Series(residuals_pred_train, name='quality')
residuals_pred = np.around(residual_model.predict(rounded_pred_opt))
residuals_pred_series = pd.Series(residuals_pred, name='quality')

# Final MARS predictions = rounded base prediction + predicted residual.
mars_pred_train = pd.concat([rounded_opt_train, residuals_pred_series_train], axis=1).sum(axis=1)
mars_pred_test = pd.concat([rounded_pred_opt, residuals_pred_series], axis=1).sum(axis=1)
print("MARS train RMSE =", np.sqrt(mean_squared_error(y_train, mars_pred_train)))
print("MARS test RMSE =", np.sqrt(mean_squared_error(y_test, mars_pred_test)))
# previous run (with the leaky test-fit correction):
#   MARS train RMSE = 0.7600269381410235
#   MARS test RMSE = 0.7745966692414834
Casting the train and test predicted responses into data frames for ensemble model training.
train_df = pd.DataFrame(np.array([gb_pred_train, xgb_pred_train, ada_pred_train,
rf_pred_train, mars_pred_train, dt_pred_train,
cat_pred_train, lgbm_pred_train, bag_pred_train,
ridge_pred_train, lasso_pred_train]).transpose(),
columns = ['gb', 'xgb', 'ada', 'rf', 'mars', 'dt',
'cat', 'lgbm', 'bag', 'ridge', 'lasso'])test_df = pd.DataFrame(np.array([gb_pred_test, xgb_pred_test, ada_pred_test,
rf_pred_test, mars_pred_test, dt_pred_test,
cat_pred_test, lgbm_pred_test, bag_pred_test,
ridge_pred_test, lasso_pred_test]).transpose(),
columns = ['gb', 'xgb', 'ada', 'rf', 'mars', 'dt',
'cat', 'lgbm', 'bag','ridge', 'lasso'])4.1 Voting ensemble
# Average predicted response as a voting ensemble.
# Only the base models with test RMSE < 0.7 are included.
# (.mean(axis = 1) replaces the equivalent .sum(axis = 1)/5 and stays
# correct if the member list changes; comment typos fixed.)
voting_members = ['xgb','ada','rf','gb','bag']
pred_train = train_df[voting_members].mean(axis = 1)
pred_test = test_df[voting_members].mean(axis = 1)
print("Train RMSE = ", np.sqrt(mean_squared_error(np.around(pred_train), y_train)))
print("Test RMSE = ", np.sqrt(mean_squared_error(np.around(pred_test), y_test)))
# output: Train RMSE =  0.043865556316766474
# output: Test RMSE =  0.6510346794496848
# making actual vs predicted plots
jitter_amount = 0.5
jittered_pred = pred_test + np.random.uniform(low=-jitter_amount, high=jitter_amount, size=len(pred_test))
x = np.linspace(3,9,1000)
plt.scatter(jittered_pred, y_test, alpha = 0.5)
plt.xlabel('Predicted Response', fontsize = 14)
plt.ylabel('Actual Response', fontsize = 14)
plt.title('Actual vs Predicted Response', fontsize = 14)
plt.plot(x, x, c = 'k');4.2 Stacking ensemble(s)
# Stacking meta-model 1: ordinary least squares on the base predictions.
linreg = LinearRegression().fit(train_df, y_train)
pred = linreg.predict(test_df)
linreg_rmse = np.sqrt(mean_squared_error(y_test, np.around(pred)))
print('RMSE for the linreg stacking ensemble =', linreg_rmse)
# output: RMSE for the linreg stacking ensemble = 0.6713934992009014
# Stacking meta-model 2: lasso, with alpha chosen by 5-fold LassoCV over a
# log-spaced grid, then refit as a plain Lasso at the selected alpha.
alphas = 10**np.linspace(0, -3, 300)*0.5
lassocv = LassoCV(alphas = alphas, cv = 5, max_iter = 100000)
lassocv.fit(train_df, y_train)
print('opt alpha = ', lassocv.alpha_)
lasso = Lasso(alpha = lassocv.alpha_)
lasso.fit(train_df, y_train)
pred = lasso.predict(test_df)
lasso_stack_rmse = np.sqrt(mean_squared_error(y_test, np.around(pred)))
print('RMSE for the lasso stacking ensemble =', lasso_stack_rmse)
# output: opt alpha =  0.0005
# output: RMSE for the lasso stacking ensemble = 0.6598368096617643
# Stacking meta-model 3: additive MARS (max_degree = 1, no interaction terms)
# fit on the base-model predictions. Note this rebinds `mars_model`, which is
# later reused in the final voting step.
mars_model = Earth(max_degree=1)
mars_model.fit(train_df, y_train)
pred = mars_model.predict(test_df)
print('RMSE for the mars stacking ensemble', np.sqrt(mean_squared_error(y_test, np.around(pred))))
# output: RMSE for the mars stacking ensemble 0.6592536572635639
# Stacking meta-model 4: random forest tuned with RandomizedSearchCV.
# FIX: random_state added to RandomizedSearchCV so the sampled candidate
# set is reproducible (the forest estimator was already seeded).
# NOTE(review): the meta-features are IN-SAMPLE base-model predictions,
# which is why the CV mse below looks implausibly small; a cleaner stack
# would use out-of-fold base predictions.
start_time = time.time()
param_grid = {'n_estimators': [100],
              'max_depth': [8, 10, 12, 14],
              'max_leaf_nodes':[100, 500, 1000],
              'max_features': [2, 4, 6, 8],
              'max_samples': [1000, 2000, 3000]}
cv = KFold(n_splits = 4, shuffle=True, random_state=1)
optimal_params = RandomizedSearchCV(estimator = RandomForestRegressor(random_state=1),
                                    param_distributions = param_grid, n_iter = 100,
                                    scoring = 'neg_mean_squared_error',
                                    n_jobs=-1, verbose = 1, cv = cv,
                                    random_state = 1)
optimal_params.fit(train_df, y_train)
print("Optimal parameter values =", optimal_params.best_params_)
print("Optimal cross validation mse = ", -optimal_params.best_score_)
print("Time taken = ", round((time.time()-start_time)/60), " minutes")
# previous (unseeded) run: best = {'n_estimators': 100, 'max_samples': 3000,
#   'max_leaf_nodes': 1000, 'max_features': 8, 'max_depth': 12},
#   cv mse = 0.0003126043406170428, time = 0 minutes
# Refit the random forest at the parameters found by the randomized search
# and score rounded predictions on the test set.
# FIX: random_state = 1 added — the original refit was unseeded, so it did
# not reproduce the (seeded) search estimator's behavior across runs.
rf_model = RandomForestRegressor(n_estimators = 100, max_samples = 3000,
                                 max_leaf_nodes = 1000, max_features = 8,
                                 verbose = False, max_depth = 12,
                                 random_state = 1,
                                 n_jobs= -1).fit(train_df, y_train)
print('Test RMSE', np.sqrt(mean_squared_error(y_test, np.around(rf_model.predict(test_df)))))
# previous (unseeded) run: Test RMSE 0.6492599337233598
# Stacking meta-model 5: XGBoost tuned with RandomizedSearchCV.
# FIX: random_state added to RandomizedSearchCV so the sampled candidate
# set is reproducible (the XGBRegressor estimator was already seeded).
start_time = time.time()
param_grid = {'max_depth': [3, 4, 5, 6],
              'learning_rate': [0.008, 0.01, 0.025, 0.05],
              'reg_lambda':[0, 1, 5],
              'n_estimators':[500, 600, 800, 1000],
              'gamma': [0, 3, 5, 10],
              'subsample': [0.5, 0.75, 1.0],
              'colsample_bytree': [0.5, 0.75, 1.0]}
cv = KFold(n_splits = 4, shuffle = True, random_state=1)
optimal_params = RandomizedSearchCV(estimator= XGBRegressor(random_state=1),
                                    param_distributions = param_grid, n_iter = 100,
                                    scoring = 'neg_mean_squared_error',
                                    n_jobs=-1, verbose = 1, cv = cv,
                                    random_state = 1)
optimal_params.fit(train_df,y_train)
print("Optimal parameter values =", optimal_params.best_params_)
print("Optimal cross validation mse = ", -optimal_params.best_score_)
print("Time taken = ", round((time.time()-start_time)/60), " minutes")
# previous (unseeded) run: best = {'subsample': 0.75, 'reg_lambda': 0,
#   'n_estimators': 1000, 'max_depth': 6, 'learning_rate': 0.008, 'gamma': 0,
#   'colsample_bytree': 1.0}, cv mse = 3.4230875375944506e-06, time = 1 minutes
# Refit XGBoost at the parameters selected by the randomized search and
# score rounded predictions on the test set.
xgb_model = XGBRegressor(subsample = 0.75, reg_lambda = 0, n_estimators = 1000,
                         max_depth = 6, learning_rate = 0.008, gamma = 0,
                         colsample_bytree = 1, random_state = 1)
xgb_model.fit(train_df, y_train)
print('Test RMSE', np.sqrt(mean_squared_error(y_test, np.around(xgb_model.predict(test_df)))))
# output: Test RMSE 0.6753916242846414
4.3 Ensemble of ensembled models
# Use a voting ensemble over the five stacking meta-models: average their
# test-set predictions with equal weights.
meta_predictions = [linreg.predict(test_df).flatten(),
                    lasso.predict(test_df),
                    rf_model.predict(test_df),
                    mars_model.predict(test_df),
                    xgb_model.predict(test_df)]
pred = sum(meta_predictions) / 5
print('rmse for the voting ensemble model:', np.sqrt(mean_squared_error(y_test, np.around(pred))))
# output: rmse for the voting ensemble model: 0.6610015710443334
# Tuning the rounding threshold: instead of rounding at .5, round each
# prediction up only when its fractional part exceeds a threshold i.
# NOTE(review): the threshold is selected on the TEST set, so the reported
# RMSE is optimistically biased — ideally tune on a validation split.
#
# Fixes: `base`/`diff` are loop-invariant and are now computed once; the
# boolean mask is vectorized with .astype(int) instead of a Python list
# comprehension over `int(x == True)`; the redundant np.around on the
# already-integer predictions is dropped (identical results).
p = np.linspace(0, 1, 100)
base = np.floor(pred)   # integer part of each averaged prediction
diff = pred - base      # fractional part, compared against each threshold
rmses = []
for i in p:
    # round up exactly where the fractional part exceeds the threshold
    new_pred = base + (diff > i).astype(int)
    rmses.append(np.sqrt(mean_squared_error(y_test, new_pred)))
plt.scatter(p, rmses)
plt.axvline(p[np.array(rmses).argmin()], ls = '--', c = 'r')
plt.xlabel('Threshold')
plt.ylabel('Test RMSE');
# making new predictions based on the optimal threshold
opt_thresh = p[np.array(rmses).argmin()]
print('opt_thresh', opt_thresh)
new_pred = base + (diff > opt_thresh).astype(int)
print('rmse for the voting ensemble model:', np.sqrt(mean_squared_error(y_test, new_pred)))
# output: opt_thresh 0.686868686868687
# output: rmse for the voting ensemble model: 0.6421119001448987
# Actual-vs-predicted plot for the threshold-tuned ensemble; jitter spreads
# the discrete integer predictions so overlapping points are visible.
jitter_amount = 0.5
noise = np.random.uniform(low=-jitter_amount, high=jitter_amount, size=len(new_pred))
jittered_pred = new_pred + noise
x = np.linspace(3,9,1000)
plt.scatter(jittered_pred, y_test, alpha = 0.4)
plt.xlabel('Predicted Response', fontsize = 14)
plt.ylabel('Actual Response', fontsize = 14)
plt.title('Actual vs Predicted Response', fontsize = 14)
# y = x reference line: perfect predictions would fall on it
plt.plot(x, x, c = 'k');